{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "from sklearn.externals import joblib\n",
    "%matplotlib inline\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "DIR = 'PATH/TO/YOUR/DATA'\n",
    "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n",
    "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n",
    "previous_application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/previous_application.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "previous_application.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing\n",
    "## Solution 3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature Engineering\n",
    "## Solution 3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Aggregations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = []\n",
    "for agg in ['mean', 'min', 'max', 'sum', 'var']:\n",
    "    for select in ['AMT_ANNUITY',\n",
    "                   'AMT_APPLICATION',\n",
    "                   'AMT_CREDIT',\n",
    "                   'AMT_DOWN_PAYMENT',\n",
    "                   'AMT_GOODS_PRICE',\n",
    "                   'CNT_PAYMENT',\n",
    "                   'DAYS_DECISION',\n",
    "                   'HOUR_APPR_PROCESS_START',\n",
    "                   'RATE_DOWN_PAYMENT'\n",
    "                   ]:\n",
    "        PREVIOUS_APPLICATION_AGGREGATION_RECIPIES.append((select, agg))\n",
    "PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], PREVIOUS_APPLICATION_AGGREGATION_RECIPIES)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "groupby_aggregate_names = []\n",
    "for groupby_cols, specs in tqdm(PREVIOUS_APPLICATION_AGGREGATION_RECIPIES):\n",
    "    group_object = previous_application.groupby(groupby_cols)\n",
    "    for select, agg in tqdm(specs):\n",
    "        groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n",
    "        application = application.merge(group_object[select]\n",
    "                              .agg(agg)\n",
    "                              .reset_index()\n",
    "                              .rename(index=str,\n",
    "                                      columns={select: groupby_aggregate_name})\n",
    "                              [groupby_cols + [groupby_aggregate_name]],\n",
    "                              on=groupby_cols,\n",
    "                              how='left')\n",
    "        groupby_aggregate_names.append(groupby_aggregate_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "application.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "application_agg = application[groupby_aggregate_names + ['TARGET']]\n",
    "application_agg_corr = abs(application_agg.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Solution 4\n",
    "### Hand crafted features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "numbers_of_applications = [1, 3, 5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = pd.DataFrame({'SK_ID_CURR': previous_application['SK_ID_CURR'].unique()})\n",
    "prev_applications_sorted = previous_application.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_object = prev_applications_sorted.groupby(by=['SK_ID_CURR'])['SK_ID_PREV'].nunique().reset_index()\n",
    "group_object.rename(index=str,\n",
    "                    columns={'SK_ID_PREV': 'previous_application_number_of_prev_application'},\n",
    "                    inplace=True)\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prev_applications_sorted['previous_application_prev_was_approved'] = (\n",
    "        prev_applications_sorted['NAME_CONTRACT_STATUS'] == 'Approved').astype('int')\n",
    "group_object = prev_applications_sorted.groupby(by=['SK_ID_CURR'])[\n",
    "    'previous_application_prev_was_approved'].last().reset_index()\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prev_applications_sorted['previous_application_prev_was_refused'] = (\n",
    "        prev_applications_sorted['NAME_CONTRACT_STATUS'] == 'Refused').astype('int')\n",
    "group_object = prev_applications_sorted.groupby(by=['SK_ID_CURR'])[\n",
    "    'previous_application_prev_was_refused'].last().reset_index()\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for number in numbers_of_applications:\n",
    "    prev_applications_tail = prev_applications_sorted.groupby(by=['SK_ID_CURR']).tail(number)\n",
    "\n",
    "    group_object = prev_applications_tail.groupby(by=['SK_ID_CURR'])['CNT_PAYMENT'].mean().reset_index()\n",
    "    group_object.rename(index=str, columns={\n",
    "        'CNT_PAYMENT': 'previous_application_term_of_last_{}_credits_mean'.format(number)},\n",
    "                        inplace=True)\n",
    "    features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
    "\n",
    "    group_object = prev_applications_tail.groupby(by=['SK_ID_CURR'])['DAYS_DECISION'].mean().reset_index()\n",
    "    group_object.rename(index=str, columns={\n",
    "        'DAYS_DECISION': 'previous_application_days_decision_about_last_{}_credits_mean'.format(number)},\n",
    "                        inplace=True)\n",
    "    features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
    "\n",
    "    group_object = prev_applications_tail.groupby(by=['SK_ID_CURR'])['DAYS_FIRST_DRAWING'].mean().reset_index()\n",
    "    group_object.rename(index=str, columns={\n",
    "        'DAYS_FIRST_DRAWING': 'previous_application_days_first_drawing_last_{}_credits_mean'.format(number)},\n",
    "                        inplace=True)\n",
    "    features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = application.merge(features,\n",
    "                                left_on=['SK_ID_CURR'],\n",
    "                                right_on=['SK_ID_CURR'],\n",
    "                                how='left',\n",
    "                                validate='one_to_one')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "engineered_numerical_columns = list(features.columns)\n",
    "engineered_numerical_columns.remove('SK_ID_CURR')\n",
    "credit_eng = X[engineered_numerical_columns + ['TARGET']]\n",
    "credit_eng_corr = abs(credit_eng.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "credit_eng_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.heatmap(credit_eng_corr, \n",
    "            xticklabels=credit_eng_corr.columns,\n",
    "            yticklabels=credit_eng_corr.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Solution 6\n",
    "\n",
    "### Hand crafted features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = pd.DataFrame({'SK_ID_CURR': previous_application['SK_ID_CURR'].unique()})\n",
    "prev_app_sorted = previous_application.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prev_app_sorted['approved'] = (prev_app_sorted['NAME_CONTRACT_STATUS'] == 'Approved').astype('int')\n",
    "prev_app_sorted['refused'] = (prev_app_sorted['NAME_CONTRACT_STATUS'] == 'Refused').astype('int')\n",
    "prev_app_sorted['revolving_loan'] = (prev_app_sorted['NAME_CONTRACT_TYPE'] == 'Revolving loans').astype('int')\n",
    "prev_app_sorted['cash_loan'] = (prev_app_sorted['NAME_CONTRACT_TYPE'] == 'Cash loans').astype('int')\n",
    "prev_app_sorted['consumer_loan'] = (prev_app_sorted['NAME_CONTRACT_TYPE'] == 'Consumer loans').astype('int')\n",
    "prev_app_sorted['credit_goods_diff'] = prev_app_sorted['AMT_CREDIT'] - prev_app_sorted['AMT_GOODS_PRICE']\n",
    "prev_app_sorted['credit_goods_ratio'] = prev_app_sorted['AMT_CREDIT'] / prev_app_sorted['AMT_GOODS_PRICE']\n",
    "prev_app_sorted['application_credit_diff'] = prev_app_sorted['AMT_APPLICATION'] - prev_app_sorted['AMT_CREDIT']\n",
    "prev_app_sorted['application_credit_ratio'] = prev_app_sorted['AMT_APPLICATION'] / prev_app_sorted['AMT_CREDIT']\n",
    "prev_app_sorted['NAME_PRODUCT_TYPE_x_sell'] = (prev_app_sorted['NAME_PRODUCT_TYPE'] == 'x-sell').astype('int')\n",
    "prev_app_sorted['NAME_PRODUCT_TYPE_walk_in'] = (prev_app_sorted['NAME_PRODUCT_TYPE'] == 'walk-in').astype('int')\n",
    "prev_app_sorted['NAME_PAYMENT_TYPE_bank'] = (prev_app_sorted['NAME_PAYMENT_TYPE'] == 'Cash through the bank').astype('int')\n",
    "prev_app_sorted['NAME_PAYMENT_TYPE_account'] = (prev_app_sorted['NAME_PAYMENT_TYPE'] == 'Non-cash from your account').astype('int')\n",
    "prev_app_sorted['NAME_PAYMENT_TYPE_employer'] = (prev_app_sorted['NAME_PAYMENT_TYPE'] == 'Cashless from the account of the employer').astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prev_app_sorted_groupby = prev_app_sorted.groupby(by=['SK_ID_CURR'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "g = prev_app_sorted_groupby['SK_ID_PREV'].agg('nunique').reset_index()\n",
    "g.rename(index=str, columns={'SK_ID_PREV': 'previous_application_number_of_prev_application'}, inplace=True)\n",
    "features = features.merge(g, on=['SK_ID_CURR'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "g = prev_app_sorted_groupby['refused'].mean().reset_index()\n",
    "g.rename(index=str, columns={'refused': 'previous_application_fraction_of_refused_applications'}, inplace=True)\n",
    "features = features.merge(g, on=['SK_ID_CURR'], how='left')\n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _get_last_k_applications_feature_name(feature_name, number, suffix):\n",
    "    return 'previous_application_{}_last_{}_applications_{}'.format(feature_name, number, suffix)\n",
    "\n",
    "\n",
    "def get_last_k_credits_features(prev_app_sorted, numbers_of_applications):\n",
    "    features = pd.DataFrame({'SK_ID_CURR': prev_app_sorted['SK_ID_CURR'].unique()})\n",
    "\n",
    "    feature_list = ['CNT_PAYMENT', 'DAYS_DECISION', 'DAYS_FIRST_DRAWING', 'NFLAG_INSURED_ON_APPROVAL',\n",
    "                        'NAME_PRODUCT_TYPE_x_sell', 'NAME_PRODUCT_TYPE_walk_in', 'NAME_PAYMENT_TYPE_bank',\n",
    "                        'NAME_PAYMENT_TYPE_account', 'NAME_PAYMENT_TYPE_employer', 'approved', 'refused',\n",
    "                        'revolving_loan', 'cash_loan', 'consumer_loan', 'credit_goods_diff', 'credit_goods_ratio',\n",
    "                        'application_credit_diff', 'application_credit_ratio']\n",
    "\n",
    "    for number in numbers_of_applications:\n",
    "        prev_applications_tail = prev_app_sorted.groupby('SK_ID_CURR').tail(number)\n",
    "        tail_groupby = prev_applications_tail.groupby('SK_ID_CURR')\n",
    "        g = tail_groupby[feature_list].agg('mean')\n",
    "\n",
    "        g = g.rename(axis='columns', mapper=partial(_get_last_k_applications_feature_name,\n",
    "                                                        number=number, suffix='mean')).reset_index()\n",
    "        features = features.merge(g, how='left', on='SK_ID_CURR')\n",
    "\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from functools import partial\n",
    "\n",
    "g = get_last_k_credits_features(prev_app_sorted, numbers_of_applications=numbers_of_applications)\n",
    "features = features.merge(g, on=['SK_ID_CURR'], how='left')\n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = application.merge(features,\n",
    "                                left_on=['SK_ID_CURR'],\n",
    "                                right_on=['SK_ID_CURR'],\n",
    "                                how='left',\n",
    "                                validate='one_to_one')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "engineered_numerical_columns = list(features.columns)\n",
    "engineered_numerical_columns.remove('SK_ID_CURR')\n",
    "credit_eng = X[engineered_numerical_columns + ['TARGET']]\n",
    "credit_eng_corr = abs(credit_eng.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "credit_eng_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(15, 15))\n",
    "sns.heatmap(credit_eng_corr, \n",
    "            xticklabels=credit_eng_corr.columns,\n",
    "            yticklabels=credit_eng_corr.columns)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
